9 Differential Expression

We then compare the protein expression between strains for each pairwise comparison (28 unique pairwise combinations)

To detect differentially expressed genes, we use the limma analysis on normalized protein expression.

9.1 Volcano plots

First, we can look at the volcano plots (log-foldchange vs qvalue) for each unique pairwise comparison.

#ind_na_rows  = find_na_rows(int_norm,as.indices = T)
df_imputed = tibble( uniprot = int_norm_ids,
                     is_imputed = (rowSums( is.na(int_norm))>0)* 1, 
                     imputed = factor(is_imputed, levels = c(0,1), labels = c("not", "is_imputed")))
volcPlot(INPUT=int_bpca, IMPUTED=df_imputed, MIN_LFC=2, MIN_PVAL=0.01, WHICH='both', TOPN = 20, plot = F, use_plotly = T)
#> Joining, by = c("ID", "pValue", "qValue", "EffectSize",
#> "comparison", "sig", "log10_qvalue", "SGD", "ORF",
#> "UNIPROT", "GENENAME", "is_imputed", "imputed")

9.2 Differentially expressed genes

# Without NA
volcano_data =  get_volcano_data(input_data=df_int_bpca, min_lfc=2, min_pval=0.01, topn = 50) %>%
                bind_rows() %>% as_tibble()
#> selecting both up and down regulated genes...
dfe_nona = subset(volcano_data,sig!='Non significant')
N_DFE_NONA = nrow(dfe_nona)
NPROT_DFE_NONA =  n_distinct(dfe_nona$ID)
R_NONA = N_DFE_NONA/NPROT_DFE_NONA %>% round(0)
# With imputed expression to replace NA
dfe = get_dfe(INPUT=df_int_bpca, WHICH='both',MIN_LFC=2, MIN_PVAL=0.01, TOPN = 20, 
              count_up=T, count_down=T)
#> Joining, by = "ID"
#> Joining, by = "ID"

gene_diffexp = unique(dfe$ID)
gene_sameexp = unique(setdiff(volcano_data$ID,dfe$ID) )

df_int_bpca = df_int_bpca %>% mutate(is_regulated = uniprot %in% gene_diffexp)

sc_abundance = load.abundance()
#> REF: B. Ho et al., 2018, Cell Systems
#> Unification of Protein Abundance Datasets Yields a Quantitative Saccharomyces cerevisiae Proteome
strain_exp= df_int_bpca %>%  group_by(uniprot) %>% mutate(mean_strains = mean(c_across(all_of(toupper(samples)))) )

DD = volcano_data %>% group_by(ID) %>% 
      summarize(var_lfc = var(EffectSize), mean_lfc = mean(EffectSize)) %>%
      left_join(evo_yeast, by=c('ID'='UNIPROT')) %>% 
      dplyr::left_join(sc_abundance, by=c('orf')) %>% 
      left_join(strain_exp,by=c('ID'='uniprot')) %>%
      mutate(sig = ID %in% dfe_nona$ID) %>% 
      mutate(f_snp.nt.y8 = n_snp.nt.y8/len.nt,
             f_snp.nt.yk11 = n_snp.nt.yk11/len.nt,
              f_snp.aa.y8 = n_snp.aa.y8/len.aa,
              f_snp.aa.yk11 = n_snp.aa.yk11/len.aa
             ) %>%
      left_join(gene_feature,by=c('ID'='UNIPROTKB'))
gene_numfeat =  DD %>% dplyr::select(1:5, where(is.numeric)) %>% relocate(where(is.character))

# spearman(DD$var_lfc,DD$mean_strains)
# spearman(DD$var_lfc,DD$f_snp)
# spearman(DD$var_lfc,DD$r4s.fungi)

# ggboxplot(DD,x="is_regulated",y="MPC") + stat_compare_means()
# ggboxplot(DD,x="is_regulated",y="r4s.fungi") + stat_compare_means()
# ggboxplot(DD,x="is_regulated",y="mean_strains") + stat_compare_means()
# ggboxplot(DD,x="is_regulated",y="f_snp") + stat_compare_means()
# ggboxplot(DD,x="is_regulated",y="f_snp.nt.y8") + stat_compare_means()
# ggboxplot(DD,x="is_regulated",y="f_snp.nt.yk11") + stat_compare_means()
# ggboxplot(DD,x="is_regulated",y="f_snp.aa.y8") + stat_compare_means()
# ggboxplot(DD,x="is_regulated",y="f_snp.aa.yk11") + stat_compare_means()

give.n <- function(x){
  return(c(y = median(x)*1.1, label = length(x))) 
  # experiment with the multiplier to find the perfect position
}
plots_POS_REG = list()
plots_NEG_REG = list()
for(i in 7:ncol(gene_numfeat)){
  feat = colnames(gene_numfeat)[i]
  dd = DD %>% dplyr::select(all_of(c('ID','orf','GNAME','HAS_ORTHOLOG','len_ref','is_regulated',feat,'MPC','r4s.yk11','r4s.fungi',toupper(samples))))
  s1 = dd[dd$is_regulated,feat] %>% drop_na %>% pull(feat) %>% as.numeric
  s2 = dd[!dd$is_regulated,feat]%>% drop_na %>%  pull(feat) %>% as.numeric
  w = wilcox.test(x=s1,y=s2)
  if(w$p.value<0.01){
    if(median(s1) < median(s2)){
       plots_POS_REG[[feat]] = ggboxplot(dd,x="is_regulated",y=feat) + stat_compare_means() +
                               stat_summary(fun.data = give.n, geom = "text", fun.y = median)
    }else{
      plots_NEG_REG[[feat]] = ggboxplot(dd,x="is_regulated",y=feat) + stat_compare_means() + 
                              stat_summary(fun.data = give.n, geom = "text", fun.y = median)
    }
  }
}

# length(plots_NEG_REG)
# length(plots_POS_REG)

# names(plots_NEG_REG)
# names(plots_POS_REG)

# library(gridExtra)
# pdf("~/Desktop/plots-features-negative_regulated.pdf", onefile = TRUE)
# for (i in seq(length(plots_NEG_REG))) {
#   plot(plots_NEG_REG[[i]])  
# }
# dev.off()


# library(gridExtra)
# pdf("~/Desktop/plots-features-positve_regulated.pdf", onefile = TRUE)
# for (i in seq(length(plots_POS_REG))) {
#   plot(plots_POS_REG[[i]])  
# }
# dev.off()

# tmp[,c("f_snp",'f_snp.aa.yk11')]
# spearman(tmp_sig$mean_strains,tmp_sig$r4s.fungi)
# quantile(tmp$EffectSize)
# 
# cor(tmp$var_lfc[tmp$sig])

  # ints(tmp$var_lfc[tmp$sig!=])
N_DFE_IMPUTE = nrow(dfe)
NPROT_DFE_IMPUTE =  n_distinct(dfe$ID)
R_IMPUTE= N_DFE_IMPUTE/NPROT_DFE_IMPUTE %>% round(0)

# get_dfe(int_norm, MIN_LFC=2, MIN_PVAL=0.01,  WHICH='both', TOPN = 20) %>% remove_rownames() %>% 
#   dplyr::left_join(sc_identifiers, by=c('ID'='UNIPROT'))

# Number of times a hit is differentially expressed
df_dfe = dfe %>% 
         left_join(janitor::tabyl(dfe,ID,sig)) %>%
         rename(uniprot=ID) %>% 
         group_by(uniprot,comparison) %>% 
          mutate( n_strains_up = sum(c_across(starts_with('up_')) !=0 ),
                n_strains_down = sum(c_across(starts_with('down_'))!=0)) %>%
        replace_na(list(n_strains_up=0,n_strains_down=0)) %>%
        left_join(evo_yeast, by=c('uniprot'='UNIPROT')) %>% 
        relocate(uniprot,GENENAME,sig,pValue,qValue,EffectSize,comparison,
                 Downregulated,Upregulated,n_strains_up,n_strains_down,orf)
#> Joining, by = "ID"
  

df_dfe_annot = df_dfe %>%
          left_join(sc_annotation_orf,by=c('uniprot'='UNIPROT')) %>%
  mutate(uniprot_link = paste0("<a href='https://www.uniprot.org/uniprot/",uniprot,"'>",uniprot,"</a>"),
         sgd_link = paste0("<a href='https://www.yeastgenome.org/locus/",SGD,"'>",SGD,"</a>"),
         regulated = Downregulated+Upregulated) %>% 
  dplyr::relocate(uniprot,uniprot_link,sgd_link,regulated,Downregulated,Upregulated, 
                  GENENAME,ORF,PNAME,'FUNCTION','BIOPROCESS_all','ORTHO','OTHER')

We get 412 genes differentially expressed when excluding genes with missing expression in any samples.

On average, each gene is detected in 4.118932 unique pairwise strain comparison.

After imputation of missing expression with bpca (Bayesian missing value imputation), we get 0 more genes differentially expressed (n=412).

On average, each gene is detected in 4.118932 unique pairwise strain comparison.

The following table shows the list of differentially expressed genes across all unique pairwise comparison, with annotations data and conservation/snp information.

library(kableExtra)
library(formattable)
library(DT)

  # formattable(
  #   list(
  #     `Downregulated` = color_tile("white", "red"),
  #     `Upregulated` = color_tile("white", "blue"),
  #     `regulated` = color_tile("white", "gray")
  #   )
  # ) %>% 

ft_dt = DT::datatable(df_dfe_annot,
        options = list(
            paging = TRUE, pageLength = 20,  ## paginate the output and #rows for each page
            scrollY = TRUE,   ## enable scrolling on X/Y axis
            autoWidth = TRUE, ## use smart column width handling
            server = FALSE,   ## use client-side processing
            dom = 'Bfrtip', buttons = list('csv', 'excel', list(extend = 'colvis')),
            fixedColumns = list(leftColumns = 1),
            columnDefs = list(list(width = '50px', visible=TRUE, targets = "_all"))
          ),
  extensions = c('FixedHeader','FixedColumns','Buttons'),
  selection = 'single',           ## enable selection of a single row
  filter = 'top',              ## include column filters at the bottom
  rownames = FALSE,               ## don't show row numbers/names
  width = NULL, 
  height = NULL,
  caption = NULL
) %>% 
   formatStyle(columns = 1:30, target= 'row',lineHeight='100%', `font-size` = '12px')

ft_dt

9.3 Heatmap of expression for differentially expressed genes


dat_scaled = int_scaled_strains %>% as.data.frame() %>% rownames_to_column('uniprot') 

dfe_exp = dat_scaled %>% dplyr::filter( uniprot %in% dfe$ID) %>% 
          left_join(sc_identifiers,by=c('uniprot'='UNIPROT'))  %>% 
          dplyr::filter(!duplicated(GENENAME)) %>%
          mutate(GENENAME=if_na(GENENAME,uniprot))%>%
          column_to_rownames(var = 'GENENAME') %>%
          dplyr::select(-ORF,-uniprot,-SGD)

p_dfe_exp=pheatmap::pheatmap(dfe_exp,
                             fontsize = 5,cellwidth = 5,cellheight =5,border_color = NA,treeheight_col = 10,
                             filename = here('plot','heatmap-exp.png'))
knitr::include_graphics('plot/heatmap-exp.png',auto_pdf = T)

9.4 Heatmap of expression differences


dfe_lfc = get_volcano_data(input_data=int_bpca, which='both',min_lfc=2, min_pval=0.01, topn = 20) %>% 
          bind_rows %>% as_tibble() %>%
          pivot_wider(id_cols=ID, names_from = 'comparison', values_from = 'EffectSize') %>% 
          dplyr::filter(ID %in% dfe$ID) %>% 
          left_join(sc_identifiers,by=c('ID'='UNIPROT')) %>%
          dplyr::filter(!duplicated(GENENAME)) %>%
          mutate(GENENAME=if_na(GENENAME,ID))%>%
          column_to_rownames('GENENAME')

dfe_lfc_mat = dfe_lfc %>% dplyr::select(-ORF,-ID,-SGD)
# Heatmap of differentially expressed genes
pheatmap::pheatmap(dfe_lfc_mat,fontsize = 5,cutree_rows = 10,cellwidth = 5,cellheight =5,border_color = NA,
                          treeheight_col = 10, filename = here('plot','heatmap-lfc.png')  )
knitr::include_graphics('plot/heatmap-lfc.png',auto_pdf = T)

9.5 Cluster genes based on their profile of differential expression strain-pairwise

# Transpose the matrix to calculate distance between experiments row-wise
d_strain <- dfe_lfc_mat %>% t() %>% dist(.,method = "euclidean", diag = FALSE, upper = FALSE)
# Calculate the distance between proteins row-wise 
d_prot <- dfe_lfc_mat %>% dist(.,method = "euclidean", diag = FALSE, upper= FALSE)

hc_prot = hclust(d_prot,method='ward.D2')
cl_prot = hc_prot %>% cutree(k=10)
library(dendextend)
dend_prot = as.dendrogram(hc_prot)
dend_prot = rotate(dend_prot,seq_along(cl_prot))
dend_prot <- color_branches(dend_prot, k=10)
dend_prot <- color_labels(dend_prot, k=10)

#labels_colors(dend) <-rainbow_hcl(3)[sort_levels_values( as.numeric(dend_prot[,5])[order.dendrogram(dend)] )]
dend_prot <- hang.dendrogram(dend_prot,hang_height=0.1)
dend_prot <- set(dend_prot, "labels_cex", 0.5)
# And plot:
par(mar = c(2,3,0,3))
plot(dend_prot,horiz =  TRUE,  nodePar = list(cex = .007))

#legend("topleft", legend = iris_species, fill = rainbow_hcl(10))
par(mar = rep(0,4))
circlize_dendrogram(dend_prot)
#> Loading required namespace: circlize